Loading dataset files

# Set the working directory to the folder containing this script
# (requires running inside RStudio, where rstudioapi is available)
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(script_dir)

# Input files: class labels, TPM expression matrix, clinical covariates,
# and the gene-selection index saved from the PAMR analysis
labels       <- read.csv("labels.csv")
features     <- read.csv("tpm_expression.csv")
covariates   <- read.csv("tcga_pancancer_lung_cov.csv")
sgenes_index <- read.csv("sgenes_index.csv") # saved from PAMR analysis

Formatting

Adjusting rows and columns names and NAs in covariates:

# --- labels: factorize the class column once, up front, so every later
# train/test subset shares the same level ordering ---
labels[, 2] <- factor(labels[, 2])
colnames(labels) <- c("ID", "Label")

# --- features: first column holds the gene IDs; promote it to row names ---
row.names(features) <- features[, 1]
features <- features[, -1]

# --- covariates: promote IDs to row names, transpose so samples are rows,
# and normalize the TCGA missing-value placeholders to NA ---
row.names(covariates) <- covariates[, 1]
covariates <- as.data.frame(t(covariates[, -1]))
covariates[covariates == "[Not Applicable]" | covariates == "[Not Available]"] <- NA

Preprocessing

Train/test split:

# Reproducible 50/50 train/test split over the observation indices.
# floor() makes the truncation of nrow * 0.5 explicit for odd sample
# sizes; sample() would silently truncate a non-integer size, so the
# drawn indices (given the seed) are unchanged.
set.seed(123)
train_index <- sample(nrow(labels), floor(nrow(labels) * 0.5))

Eventual data transformation to bring data towards normality/stabilising variance:

# Optional transformation to bring the data towards normality / stabilise
# variance. Leave all flags FALSE for no transformation, or set at most
# one flag to TRUE to apply that transformation to `features`.
log2transf <- FALSE
log10transf <- FALSE
deseqVST <- FALSE

if (log2transf) {
  features <- log2(1 + features) # +1 pseudo-count avoids log(0) = -Inf
}

if (log10transf) {
  features <- log10(1 + features)
}

if (deseqVST) {
  # BUGFIX: the original branch referenced traindata$x (not defined until
  # later in the script) and the undefined objects dds_train / vst_train,
  # so it could never run. Build the DESeq2 dataset from `features` itself
  # and chain the intermediate objects explicitly.
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = features,
    colData = data.frame(condition = rep(1, ncol(features))),
    design = ~ 1
  )
  vst_obj <- DESeq2::vst(dds, blind = TRUE)
  features <- as.matrix(SummarizedExperiment::assay(vst_obj))
}

Organizing data in the same format as in the PAMR analysis, but factor labels are converted to integer values starting from 0, as xgboost requires. Features are all numeric so they are fine, but the feature matrix is transposed to get variables/genes in the columns and observations in the rows. The data is subsetted to the features/genes selected in the PAMR analysis with a threshold of \(Th\); the boolean vector marking the selected genes with TRUE is in sgenes_index$sgenes_indexTh:

# Boolean vector flagging the genes retained by PAMR at threshold Th
selected <- sgenes_index$sgenes_indexTh

# Training set: xgboost wants integer labels starting at 0, and a numeric
# matrix with observations in rows, so the feature matrix is transposed
traindata <- list()
traindata$ynum <- as.integer(labels[train_index, 2]) - 1
traindata$y <- labels[train_index, 2]
traindata$x <- t(as.matrix(features[selected, train_index]))
traindata$geneid <- rownames(features[selected, ])

# Test set, organized identically
testdata <- list()
testdata$ynum <- as.integer(labels[-train_index, 2]) - 1
testdata$y <- labels[-train_index, 2]
testdata$x <- t(as.matrix(features[selected, -train_index]))
testdata$geneid <- rownames(features[selected, ])

rm(features) # free the full expression matrix; only the subsets are used below

numclass <- length(unique(traindata$ynum))
## [1] "Nr. training obvs: 545"
## [1] "Nr. test obvs: 546"
## [1] "Nr. of features (genes) selected: 24"

The traindata$ynum integer codes (0,1,2) correspond to the levels (LUAD,LUSC,NORM) of traindata$y.

Fitting

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.2.3
# Multiclass XGBoost on the PAMR-selected genes; "multi:softprob" makes
# predict() return a full posterior probability matrix, which the
# silhouette and class-map visualizations below require.
xgb_model <- xgboost(
  data = traindata$x,
  label = traindata$ynum,
  nrounds = 100,
  objective = "multi:softprob",
  num_class = numclass,
  max_depth = 10,
  verbose = FALSE
)

Visualizing Results through silplot in train and test

To produce the silplot visualization, since XGBoost is not natively supported in classmap, we can use the supplementary function in classmapExt, vcr.custom.*. We feed it just the given/true y and the extracted posterior probabilities; this is enough to enable PAC computation and thus the silplot visual.

Produce the posteriors for train:

# Posterior class probabilities on the training set; reshape=TRUE returns
# an (n x numclass) matrix instead of a flat vector. Predicted label =
# column (class) with the highest posterior in each row.
traindata$posteriors <- predict(xgb_model, traindata$x, reshape = TRUE)
colnames(traindata$posteriors) <- levels(traindata$y)
traindata$ypred <- apply(
  traindata$posteriors, 1,
  function(p) colnames(traindata$posteriors)[which.max(p)]
)

Silplot on Train:

# Training-set confusion matrix; perfect separation is expected here since
# the model is evaluated on the observations it was fit on.
# NOTE(review): confusionMatrix() comes from caret, which is never
# library()'d in this file -- confirm it is loaded in the session.
confusionMatrix(factor(traindata$ypred), traindata$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD  259    0    0
##       LUSC    0  238    0
##       NORM    0    0   48
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9933, 1)
##     No Information Rate : 0.4752     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: LUAD Class: LUSC Class: NORM
## Sensitivity               1.0000      1.0000     1.00000
## Specificity               1.0000      1.0000     1.00000
## Pos Pred Value            1.0000      1.0000     1.00000
## Neg Pred Value            1.0000      1.0000     1.00000
## Prevalence                0.4752      0.4367     0.08807
## Detection Rate            0.4752      0.4367     0.08807
## Detection Prevalence      0.4752      0.4367     0.08807
## Balanced Accuracy         1.0000      1.0000     1.00000
# Feeding only the true labels and the posterior matrix: this is enough
# for PAC computation and therefore for the silhouette plot
vcrtrainbase <- vcr.custom.train(traindata$y, probs = traindata$posteriors)

silplot(vcrtrainbase)
##  classNumber classLabel classSize classAveSi
##            1       LUAD       259       0.99
##            2       LUSC       238       0.99
##            3       NORM        48       0.99

Produce the posteriors for test set:

# Posterior class probabilities on the held-out test set; column names
# come from the TRAINING factor levels so ordering matches the model
testdata$posteriors <- predict(xgb_model, testdata$x, reshape = TRUE)
colnames(testdata$posteriors) <- levels(traindata$y)
testdata$ypred <- apply(
  testdata$posteriors, 1,
  function(p) colnames(testdata$posteriors)[which.max(p)]
)

Silplot on test:

# Test-set confusion matrix from the held-out predictions
# NOTE(review): confusionMatrix() comes from caret, which is never
# library()'d in this file -- confirm it is loaded in the session.
confusionMatrix(factor(testdata$ypred), testdata$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction LUAD LUSC NORM
##       LUAD  223   11    0
##       LUSC   16  241    0
##       NORM    3    1   51
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9432          
##                  95% CI : (0.9204, 0.9611)
##     No Information Rate : 0.4634          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9025          
##                                           
##  Mcnemar's Test P-Value : 0.1773          
## 
## Statistics by Class:
## 
##                      Class: LUAD Class: LUSC Class: NORM
## Sensitivity               0.9215      0.9526     1.00000
## Specificity               0.9638      0.9454     0.99192
## Pos Pred Value            0.9530      0.9377     0.92727
## Neg Pred Value            0.9391      0.9585     1.00000
## Prevalence                0.4432      0.4634     0.09341
## Detection Rate            0.4084      0.4414     0.09341
## Detection Prevalence      0.4286      0.4707     0.10073
## Balanced Accuracy         0.9427      0.9490     0.99596
# Silhouette plot on the test set, reusing the training-set calibration
vcrtestbase <- vcr.custom.newdata(
  ynew = testdata$y,
  probs = testdata$posteriors,
  vcr.custom.train.out = vcrtrainbase
)
silplot(vcrtestbase)
##  classNumber classLabel classSize classAveSi
##            1       LUAD       242       0.83
##            2       LUSC       253       0.89
##            3       NORM        51       0.98

Visualizing Results through classmap and mdscolorscale

Preliminary computations of needed quantities:

To enable the classmap plot we should devise a proper way to measure the distance of each observation \(i\) to each given class \(g\) (D(\(i\),\(g\))) according to the trained classifier's view of the data. We should produce a 'distance to class' matrix for all observations in the train set and another for the ones in the test set. The matrices produced will be respectively fed into vcr.custom.train and vcr.custom.newdata and will allow for farness computation.

To produce D(\(i\),\(g\)) for our XGBoost model, which belongs to the large family of tree classifiers, we essentially build upon the same concept that Rousseeuw and Raymaekers established for basic classification tree algorithms (Supplementary Material, section A.3, of "Silhouettes and Quasi Residual Plots for Neural Nets and Tree-based Classifiers", 2022, by Raymaekers and Rousseeuw).

To do that we start by computing pairwise dissimilarities between the points in the training set with the Gower metric (here essentially a weighted distance on numeric features, since all features are numeric), weighted by the average importance of the features in the classifier as output in the Gain column after calling xgb.importance on our model:

# Feature weights = average importance (Gain) of each gene in the fitted
# model; genes never used by any tree keep weight 0.
traindata$importance <- xgb.importance(model = xgb_model)
traindata$weight <- rep(0, ncol(traindata$x)) # one weight per gene, in column order
names(traindata$weight) <- colnames(traindata$x)
# xgb.importance() returns a table with columns Feature / Gain / Cover /
# Frequency; a vectorized named assignment replaces the original
# row-by-row loop over 1:nrow(importance).
traindata$weight[traindata$importance$Feature] <- traindata$importance$Gain
library(cluster)
# Gower dissimilarities between training observations, weighted by Gain
# (all features are numeric here, so this behaves like a weighted,
# range-normalized distance)
traindata$pwd <- as.matrix(daisy(traindata$x, metric = "gower", weights = traindata$weight))
any(is.na(traindata$pwd)) # check for NAs (could happen if a weight is exactly zero)
## [1] FALSE
# Pairwise dissimilarities among the test observations as well, using the
# SAME training-derived weights (needed later for the mdscolorscale plot)

testdata$pwd <- as.matrix(daisy(testdata$x, metric = "gower", weights = traindata$weight))
any(is.na(testdata$pwd)) # check for NAs (could happen if a weight is exactly zero)
## [1] FALSE

Now for the training set we compute the ‘distance to class’ matrix. We take, as the distance of a generic observation \(i\) to a generic class \(g\), the median among the \(k=5\) smaller dissimilarities (because of locality nature of decision space of tree algorithms) between object \(i\) and the set of objects \(j\) actually belonging to class \(g\).

# Per-row neighbor ranking on the training dissimilarity matrix.
# Column 1 is dropped in both results: it is each observation itself
# (self-dissimilarity 0 always sorts first).
sortNgb <- t(apply(traindata$pwd, 1, order))[, -1] # neighbor indices, nearest first
sortDis <- t(apply(traindata$pwd, 1, sort))[, -1]  # matching dissimilarity values, sorted

k <- 5                           # number of nearest same-class neighbors to summarize
yintv <- as.numeric(traindata$y) # integer class codes 1..numclass

# D(i, g) = median dissimilarity from observation i to its (at most) k
# nearest training neighbors belonging to class g -- a local notion of
# distance, matching the local decision space of tree classifiers.
traindata$distToClass <- matrix(NA_real_, nrow = nrow(traindata$x), ncol = numclass)
for (i in seq_len(nrow(traindata$x))) {
  for (g in seq_len(numclass)) {
    sameClass <- which(yintv[sortNgb[i, ]] == g) # class-g neighbors, nearest first
    if (length(sameClass) > k) {
      sameClass <- sameClass[seq_len(k)] # keep only the k nearest
    }
    traindata$distToClass[i, g] <- median(sortDis[i, sameClass])
  }
}

Now, to compute the 'distance to class' matrix for the cases in the test set we should only use the given test case and the information from the train set. So each test case is taken separately and the pairwise distances are calculated on the set composed of the training observations plus the test case considered. Then the distance between the case and each class \(g\) is computed exactly as before, that is by taking the median among the k=5 smallest dissimilarities between \(i\) and the set of training objects actually belonging to class \(g\). (Running this chunk may take a while because of the nested loops.)

# 'Distance to class' for each test case, using ONLY that case plus the
# training set: Gower dissimilarities are recomputed on the stacked
# (1 + n_train) matrix, then D(i, g) is the median of the (at most) k
# nearest training dissimilarities within class g (k set in the train chunk).
testdata$newDistToClass <- matrix(NA_real_, nrow = nrow(testdata$x), ncol = numclass)

for (i in seq_len(nrow(testdata$x))) {
  testtrain <- rbind(testdata$x[i, ], traindata$x)
  # BUGFIX: the original built yintv with rbind() on the undefined
  # testdata$label, producing a recycled 2-row matrix whose linear
  # indexing misaligned labels with neighbor indices. Concatenate the
  # test label (kept only for structural consistency; never used below)
  # with the training labels so element j of yintv matches row j of
  # testtrain.
  yintv <- c(as.numeric(testdata$y)[i], as.numeric(traindata$y))
  testtrainpwd <- as.matrix(daisy(testtrain, metric = "gower", weights = traindata$weight))
  # Row 1 of the sorted results corresponds to the i-th test case; drop
  # the first entry (the case itself, dissimilarity 0)
  sortNgb <- t(apply(testtrainpwd, 1, order))[1, -1]
  sortDis <- t(apply(testtrainpwd, 1, sort))[1, -1]

  for (g in seq_len(numclass)) { # BUGFIX: was hard-coded seq_len(3)
    ngbg <- which(yintv[sortNgb] == g) # class-g neighbors, nearest first
    if (length(ngbg) > k) {
      ngbg <- ngbg[seq_len(k)] # keep only the k nearest
    }
    testdata$newDistToClass[i, g] <- median(sortDis[ngbg])
  }
}

Produce visualizations for train set:

Now it is possible to run vcr.custom.train inputting also distToClass and allow for farness computation that in turn allow for classmap plots.

# Re-run vcr.custom.train, now also passing the distance-to-class matrix
# so farness can be computed, which in turn enables the class maps
vcrtrain=vcr.custom.train(traindata$y, traindata$posteriors, distToClasses = traindata$distToClass)

# One class map per class, drawn side by side
par(mfrow = c(1, 3))
for (g in seq_len(3)) {
  classmap(vcrtrain, whichclass = g)
}

We also previously computed the pairwise dissimilarities in the trained-xgboost model view. This enables the mdscolorscale plot:

# MDS color-scale plot of the training set, using the model-view pairwise
# dissimilarities computed earlier (traindata$pwd)
mdscolorscale(vcrtrain,diss=traindata$pwd)

Proceeding to the visualizations in Test:

Now it is possible to run vcr.custom.newdata inputting also newDistToClass and allow for farness computation that in turn allow for classmap plots.

# Same for the test set: passing newDistToClasses enables farness
# computation, which in turn enables the class maps
vcrtest=vcr.custom.newdata(ynew = testdata$y , probs = testdata$posteriors , vcr.custom.train.out = vcrtrain, newDistToClasses = testdata$newDistToClass)

# One class map per class on the test set, drawn side by side
par(mfrow = c(1, 3))
for (g in seq_len(3)) {
  classmap(vcrtest, whichclass = g)
}

We also previously computed the pairwise dissimilarities in the trained-xgboost model view on the whole set of test observations. This enables the mdscolorscale plot:

# MDS color-scale plot of the test set, using the pairwise distances
# computed among the test observations only (testdata$pwd)
mdscolorscale(vcrtest, diss=testdata$pwd)